pgdrv <- dbDriver("PostgreSQL")
con <- dbConnect(pgdrv, dbname="dvd",
port="5432",
user="postgres",
password=pd,
host="localhost")
top_rented <- dbGetQuery(con, "SELECT c.name, COUNT(c.name) AS count_genre, SUM(p.amount) AS total_sale FROM payment p
INNER JOIN rental r ON p.rental_id = r.rental_id
INNER JOIN inventory i ON i.inventory_id = r.inventory_id
INNER JOIN film_category fc ON fc.film_id = i.film_id
INNER JOIN category c ON c.category_id = fc.category_id
GROUP BY c.name
ORDER BY count_genre DESC
LIMIT 1")
least_rented <- dbGetQuery(con, "SELECT c.name, COUNT(c.name) AS count_genre, SUM(p.amount) AS total_sale FROM payment p
INNER JOIN rental r ON p.rental_id = r.rental_id
INNER JOIN inventory i ON i.inventory_id = r.inventory_id
INNER JOIN film_category fc ON fc.film_id = i.film_id
INNER JOIN category c ON c.category_id = fc.category_id
GROUP BY c.name
ORDER BY count_genre ASC
LIMIT 1")
du_genre <- dbGetQuery(con, "SELECT x.name, COUNT(x.name) as num_user FROM (
SELECT DISTINCT c.name, r.customer_id FROM category c
INNER JOIN film_category fc ON c.category_id = fc.category_id
INNER JOIN inventory i ON i.film_id = fc.film_id
INNER JOIN rental r ON r.inventory_id = i.inventory_id) x
GROUP BY x.name
ORDER BY x.name")
ave_ren_rate <- dbGetQuery(con, "SELECT c.name, ROUND(AVG(f.rental_rate),2) AS average_rental_rate FROM category c
INNER JOIN film_category fc ON c.category_id = fc.category_id
INNER JOIN film f ON f.film_id = fc.film_id
GROUP BY c.name
ORDER BY average_rental_rate DESC;")
re_time <- dbGetQuery(con, "SELECT x1.result, COUNT(x1.result) FROM (
SELECT (
CASE WHEN res.rental_duration > res.bw THEN 'early'
WHEN res.rental_duration = res.bw THEN 'on time'
WHEN res.rental_duration < res.bw THEN 'late'
ELSE 'NA'
END) AS result
FROM (SELECT x.*
FROM (
SELECT f.film_id, f.rental_duration, DATE_PART('day',r.return_date - r.rental_date) AS bw
FROM rental r
INNER JOIN inventory i ON i.inventory_id = r.inventory_id
INNER JOIN film f ON f.film_id = i.film_id
) x
) res
) x1
GROUP BY x1.result;")
have_presence <- dbGetQuery(con, "SELECT c.country, COUNT(DISTINCT cus.customer_id) AS customer_base, SUM(p.amount) AS total_sale FROM customer cus
INNER JOIN address a ON cus.address_id = a.address_id
INNER JOIN city ct ON a.city_id = ct.city_id
INNER JOIN country c ON ct.country_id = c.country_id
INNER JOIN payment p ON p.customer_id = cus.customer_id
GROUP BY c.country
ORDER BY total_sale DESC;")
# get DB as a whole for in-depth analysis
category <- dbGetQuery(con, "SELECT category_id, name FROM category")
film_category <- dbGetQuery(con, "SELECT film_id, category_id FROM film_category")
inventory <- dbGetQuery(con, "SELECT film_id, inventory_id FROM inventory")
rental <- dbGetQuery(con, "SELECT rental_id, inventory_id FROM rental")
payment <- dbGetQuery(con, "SELECT payment_id, payment_date, rental_id, amount FROM payment")
film <- dbGetQuery(con, "SELECT film_id, title, rental_duration FROM film")
# get sales and rent of each genre (desc)
genre_rented <- dbGetQuery(con, "SELECT c.name, COUNT(c.name) AS count_genre, SUM(p.amount) AS total_sale FROM payment p
INNER JOIN rental r ON p.rental_id = r.rental_id
INNER JOIN inventory i ON i.inventory_id = r.inventory_id
INNER JOIN film_category fc ON fc.film_id = i.film_id
INNER JOIN category c ON c.category_id = fc.category_id
GROUP BY c.name
ORDER BY count_genre DESC")
top_rented
## name count_genre total_sale
## 1 Sports 1081 4892.19
least_rented
## name count_genre total_sale
## 1 Music 750 3071.52
du_genre
## name num_user
## 1 Action 510
## 2 Animation 500
## 3 Children 482
## 4 Classics 468
## 5 Comedy 495
## 6 Documentary 483
## 7 Drama 501
## 8 Family 501
## 9 Foreign 493
## 10 Games 474
## 11 Horror 451
## 12 Music 447
## 13 New 468
## 14 Sci-Fi 507
## 15 Sports 519
## 16 Travel 442
ave_ren_rate
## name average_rental_rate
## 1 Games 3.25
## 2 Travel 3.24
## 3 Sci-Fi 3.22
## 4 Comedy 3.16
## 5 Sports 3.13
## 6 New 3.12
## 7 Foreign 3.10
## 8 Horror 3.03
## 9 Drama 3.02
## 10 Music 2.95
## 11 Children 2.89
## 12 Animation 2.81
## 13 Family 2.76
## 14 Classics 2.74
## 15 Documentary 2.67
## 16 Action 2.65
re_time
## result count
## 1 NA 183
## 2 late 6403
## 3 on time 1720
## 4 early 7738
have_presence
## country customer_base total_sale
## 1 India 60 6034.78
## 2 China 53 5251.03
## 3 United States 36 3685.31
## 4 Japan 31 3122.51
## 5 Mexico 30 2984.82
## 6 Brazil 28 2919.19
## 7 Russian Federation 28 2765.62
## 8 Philippines 20 2219.70
## 9 Turkey 15 1498.49
## 10 Indonesia 14 1352.69
## 11 Nigeria 13 1314.92
## 12 Argentina 13 1298.80
## 13 Taiwan 10 1155.10
## 14 South Africa 11 1069.46
## 15 Iran 8 877.96
## 16 United Kingdom 9 850.96
## 17 Poland 8 786.16
## 18 Italy 7 753.26
## 19 Germany 7 741.24
## 20 Vietnam 6 676.45
## 21 Ukraine 6 675.53
## 22 Colombia 6 661.54
## 23 Egypt 6 659.48
## 24 Venezuela 7 632.43
## 25 Canada 5 559.70
## 26 Netherlands 5 557.73
## 27 South Korea 5 527.77
## 28 Spain 5 513.80
## 29 Yemen 4 473.93
## 30 Pakistan 5 473.84
## 31 Saudi Arabia 5 452.94
## 32 Peru 4 407.01
## 33 Thailand 3 401.08
## 34 Israel 4 379.13
## 35 Ecuador 3 369.18
## 36 Bangladesh 3 353.19
## 37 Algeria 3 349.18
## 38 France 4 334.12
## 39 Malaysia 3 330.23
## 40 Tanzania 3 322.22
## 41 Mozambique 3 315.25
## 42 United Arab Emirates 3 305.25
## 43 Dominican Republic 3 304.26
## 44 Chile 3 303.34
## 45 Austria 3 284.30
## 46 Morocco 3 274.35
## 47 Paraguay 3 273.40
## 48 Belarus 2 271.36
## 49 Latvia 2 249.43
## 50 Switzerland 3 248.41
## 51 Kenya 2 245.49
## 52 Yugoslavia 2 233.49
## 53 Puerto Rico 2 224.48
## 54 Romania 2 218.42
## 55 Runion 1 211.55
## 56 French Polynesia 2 205.52
## 57 Greece 2 204.54
## 58 Sudan 2 202.51
## 59 Azerbaijan 2 198.53
## 60 Bulgaria 2 194.52
## 61 Kazakstan 2 192.51
## 62 Angola 2 187.55
## 63 Cameroon 2 186.49
## 64 Myanmar 2 179.53
## 65 Cambodia 2 179.51
## 66 Bolivia 2 178.56
## 67 Congo, The Democratic Republic of the 2 168.58
## 68 Oman 2 161.56
## 69 Holy See (Vatican City State) 1 146.68
## 70 Nauru 1 143.70
## 71 Sweden 1 139.67
## 72 Czech Republic 1 132.72
## 73 Moldova 1 127.66
## 74 Turkmenistan 1 126.74
## 75 Chad 1 122.72
## 76 Malawi 1 121.73
## 77 Zambia 1 121.70
## 78 Virgin Islands, U.S. 1 121.69
## 79 Greenland 1 119.72
## 80 Armenia 1 118.75
## 81 Gambia 1 114.73
## 82 Iraq 1 111.73
## 83 Hungary 1 111.71
## 84 Bahrain 1 108.76
## 85 North Korea 1 107.71
## 86 Brunei 1 107.66
## 87 Kuwait 1 106.75
## 88 Estonia 1 105.72
## 89 Hong Kong 1 104.76
## 90 Sri Lanka 1 103.73
## 91 Liechtenstein 1 99.74
## 92 Anguilla 1 99.68
## 93 French Guiana 1 97.80
## 94 Faroe Islands 1 96.76
## 95 Senegal 1 95.76
## 96 Nepal 1 93.83
## 97 Tuvalu 1 93.78
## 98 Madagascar 1 92.79
## 99 Ethiopia 1 91.77
## 100 New Zealand 1 85.77
## 101 Slovakia 1 80.77
## 102 Finland 1 78.79
## 103 Tunisia 1 73.78
## 104 Afghanistan 1 67.82
## 105 Tonga 1 64.84
## 106 Saint Vincent and the Grenadines 1 64.82
## 107 Lithuania 1 63.78
## 108 American Samoa 1 47.85
# get required variables
nt = genre_rented %>% select(name, count_genre)
# draw plot in descending order
nt %>%
plot_ly(x = ~name, y = ~count_genre) %>%
add_bars() %>%
layout(title ="Popularity of Genres (without Adjustment)",
autosize = T,
yaxis = list(title = '# of rents', range = c(600, 1200), showline = TRUE),
xaxis = list(type = 'category', title = 'Genre', categoryorder = "array", categoryarray = ~c("Sports", "Animation", "Action", "Sci-Fi", "Family", "Foreign", "Drama", "Documentary", "Games", "New", "Children", "Classics", "Comedy", "Horror", "Travel", "Music") ))
Question: Sports seems to be the most popular and Music seems to be the least. Is this really so?
# connect category to film_id
cf = merge(x = category, y = film_category, by = "category_id", y.all = TRUE)
# number of disticnt films per category
ci_s = cf %>% group_by(category_id) %>%
summarise(sum = sum(film_id/film_id))
# get variables needed
dfg = merge(x = category, y = ci_s) %>% select(name, sum)
# draw plot in descending order
dfg %>%
plot_ly(x = ~name, y = ~sum) %>%
add_bars() %>%
layout(title ="Distinct Titles per Genre",
autosize = T,
yaxis = list(title = '# of Distinct Movies', range = c(45, 80), showline = TRUE),
xaxis = list(type = 'category', title = 'Genre', categoryorder = "array", categoryarray = ~c("Sports", "Animation", "Action", "Sci-Fi", "Family", "Foreign", "Drama", "Documentary", "Games", "New", "Children", "Classics", "Comedy", "Horror", "Travel", "Music")))
Insight: Sports has the most many number of distinct titles and music has the least. Given this, Sport and Music might not be the most and least poular genres respectively.
# create adjusted rent count
rrc = merge(x=nt, y=dfg) %>% transmute(name, arc = count_genre / sum)
rrc %>%
plot_ly(x = ~name, y = ~arc, color = 'red') %>%
add_bars() %>%
layout(title ="Popularity of Genres (with Adjustment)",
autosize = T,
yaxis = list(title = '# of Rents', range = c(10, 18), showline = TRUE),
xaxis = list(type = 'category', title = 'Genre', categoryorder = "array", categoryarray = ~c("Sports", "Animation", "Action", "Sci-Fi", "Family", "Foreign", "Drama", "Documentary", "Games", "New", "Children", "Classics", "Comedy", "Horror", "Travel", "Music") ))
Insight: the plot above represents # of rents per genre divided by distinct titles per genre. It is observable that Sports is not as popular given its large number of distinct titles. Music is actually more popular than Sports in this respect.
# inventory count per film
icf = inventory %>% count(film_id)
# select need var
nn = merge(icf, cf) %>% select(name, n)
# investment by genre
ig = nn %>% group_by(name) %>%
summarise(sum = sum(n))
# get sales per genre
gs = genre_rented %>% select(name, total_sale)
git = merge(ig, gs)
# regression
ts_lm <- lm(total_sale ~ sum, data = git)
git %>%
plot_ly(x = ~sum, y = ~total_sale, color = ~name, hoverinfo = "text",
text = ~paste0("<b>", name, "</b> <br>",
"Sales: ", total_sale, "<br>",
"Inventory: ", sum)) %>%
#add_lines(y = ~fitted(ts_lm), name = "Regression Line") %>%
layout(title = 'Relationship between Investment and Sales (by genre)',
xaxis = list(title = 'Inventory', zeroline = TRUE),
yaxis = list(title = 'Sales per Genre'))
Insight: this plot shows whether each genre is generating enough sales proportionate to its inventory. The genre Childern and Classics are somewhat lagging behind, and Comedy is showing a good performance. However, there is no outlier probably because each genre contains many films.
# get only the needed variables
ra = payment %>% select(rental_id, amount)
ri = rental %>% select(rental_id, inventory_id)
fi = inventory %>% select(inventory_id, film_id)
ftr = film %>% select(film_id, title, rental_duration)
# sum sales by rental_id
rs = ra %>% group_by(rental_id) %>%
summarise(sum = sum(amount))
# get sales per inventory_id
spi = merge(x=rs,y=ri,by="rental_id", all.y=TRUE)
# na to 0
spi[is.na(spi)] = 0
# film_id and sum
fs = merge(x=spi, y=fi, by = 'inventory_id') %>% select(sum, film_id)
# inventory per film
ipf = fs %>% count(film_id)
# sales per film
spf = fs %>% group_by(film_id) %>% summarise(sales_per_film = sum(sum))
# sales and inventory of films
fsi = merge(spf, ipf, by='film_id')
tsnr = merge(fsi, ftr, by='film_id', all.y = TRUE) %>% select(title, sales_per_film, n, rental_duration)
tsnr = tsnr[!is.na(tsnr$n),]
tsnr = mutate(tsnr, ratio = sales_per_film / n)
bin = cut(tsnr$ratio, 3, labels = c('High', 'Mid', 'Low'), include.lowest = FALSE, right = TRUE, ordered_result = FALSE)
tsnr = cbind(tsnr, bin)
tsnr %>%
plot_ly(x = ~n, y = ~sales_per_film, color = bin, hoverinfo = "text",
text = ~paste0("<b>", title, "</b> <br>",
"Sales: ", sales_per_film, "<br>",
"Inventory: ", n, "<br>",
"Sales/Inventory: ", round(ratio, digit=2))) %>%
#add_lines(y = ~fitted(ts_lm), name = "Regression Line") %>%
layout(title = 'Relationship between Investment and Sales (by film)',
xaxis = list(title = 'Inventory', zeroline = TRUE),
yaxis = list(title = 'Sales per Film'))
Insight: this plot shows whether each film is generating enough sales with respect to its inventory. The three groups equally divides the lists of films according to their ratio of sales to invesment (inventory). By examining this table, we could know the what films need more inventory and what films the owner should not purchase more.
# select only the relevant variables
payment = payment %>% select(amount, payment_date)
payment$payment_date = format(as.Date(payment$payment_date), "%Y-%m")
# sales per month
spm = payment %>% group_by(payment_date) %>% summarise(sales_per_month = sum(amount))
plot_ly(opacity = 1) %>%
add_lines(x = ~spm$payment_date, y = ~spm$sales_per_month, name = "Sales Trend") %>%
layout(xaxis = list(title = 'Sales Amount'), yaxis = list(title = 'Time'))
Insight: although the time horizon for this dataset is not long enough to draw quality insight, trend analysis is a case worth practicing.